import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook" # Let's us display Plotly figs after export to html file
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format # display numbers as with thousands separator
pd.set_option('display.max_colwidth', None) # display full text in columns
from IPython.display import display, HTML
# Custom CSS to align DataFrame values to the right
css_rules = """
<style>
table.dataframe th, table.dataframe td {
text-align: right !important;
}
</style>
"""
# Apply the custom CSS
display(HTML(css_rules))
df = pd.read_csv('Data/clean_field_of_study.csv', dtype={'CIPFIELD': str, 'CIPCODE': str})
# Look at which "EARN" variables have the most datapoints
earn_col_counts = {}
for col in df.columns:
if "EARN" in col:
earn_col_counts[col] = len(df[col].unique()) -1 # subtract 1 to account for NaN
# sort dictionary by value to see which variables have the most data
earn_col_counts = {k: v for k, v in sorted(earn_col_counts.items(), key=lambda item: item[1], reverse=True)}
list(earn_col_counts.items())[:5] # Display first 5 items
[('EARN_MDN_HI_1YR', 264),
('EARN_MDN_1YR', 254),
('EARN_MDN_4YR', 244),
('EARN_PELL_WNE_MDN_1YR', 226),
('EARN_NE_MDN_3YR', 219)]
# We decide to us EARN_MDN_1YR and EARN_MDN_4YR
df = df[df.columns.to_list()[:7] + ['EARN_MDN_1YR', 'EARN_MDN_4YR', 'EARN_COUNT_WNE_1YR', 'EARN_COUNT_WNE_4YR']]
# Get counts of nans
# Number of rows where all four EARN columns have NaN values
all_earn_nan = df[df[['EARN_MDN_1YR', 'EARN_MDN_4YR', 'EARN_COUNT_WNE_1YR', 'EARN_COUNT_WNE_4YR']].isna().all(axis=1)].shape[0]
# Number of rows where each individual EARN column has NaN values
earn_mdn_1yr_nan = df['EARN_MDN_1YR'].isna().sum()
earn_mdn_4yr_nan = df['EARN_MDN_4YR'].isna().sum()
earn_count_wne_1yr_nan = df['EARN_COUNT_WNE_1YR'].isna().sum()
earn_count_wne_4yr_nan = df['EARN_COUNT_WNE_4YR'].isna().sum()
print(f'All four EARN columns nan count: {all_earn_nan}\n1YR median earnings nan count: {earn_mdn_1yr_nan}\n4YR median earnings nan count: {earn_mdn_4yr_nan}')
# Drop rows with nan's (we lose 310 total rows -- 262 of which all four columns are nan)
df.dropna(inplace=True)
All four EARN columns nan count: 262 1YR median earnings nan count: 299 4YR median earnings nan count: 310
# Group earnings by universities (summary stats of all programs with public record)
univ_1yr_stats = df.groupby('INSTNM').agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
univ_1yr_stats = univ_1yr_stats.sort_values(by=('EARN_MDN_1YR', 'mean'), ascending=False).reset_index(drop=True)
univ_1yr_stats
| INSTNM | EARN_COUNT_WNE_1YR | EARN_MDN_1YR | |||
|---|---|---|---|---|---|
| sum | mean | min | max | ||
| 0 | University of Utah | 3,280.00 | 47,227.86 | 25,918.00 | 85,652.00 |
| 1 | Brigham Young University | 3,652.00 | 47,174.15 | 17,223.00 | 86,469.00 |
| 2 | Westminster College | 155.00 | 46,325.40 | 31,263.00 | 59,998.00 |
| 3 | Weber State University | 2,198.00 | 46,260.09 | 17,779.00 | 77,117.00 |
| 4 | Utah Valley University | 3,134.00 | 44,682.09 | 22,648.00 | 82,864.00 |
| 5 | Utah State University | 3,427.00 | 39,905.87 | 21,265.00 | 67,878.00 |
| 6 | Southern Utah University | 525.00 | 35,300.25 | 19,031.00 | 58,924.00 |
# Group earnings by universities (summary stats of all programs with public record)
univ_4yr_stats = df.groupby('INSTNM').agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
univ_4yr_stats = univ_4yr_stats.sort_values(by=('EARN_MDN_4YR', 'mean'), ascending=False).reset_index(drop=True)
univ_4yr_stats
| INSTNM | EARN_COUNT_WNE_4YR | EARN_MDN_4YR | |||
|---|---|---|---|---|---|
| sum | mean | min | max | ||
| 0 | Brigham Young University | 3,543.00 | 63,373.25 | 23,498.00 | 127,592.00 |
| 1 | University of Utah | 1,560.00 | 61,987.43 | 37,398.00 | 99,562.00 |
| 2 | Westminster College | 196.00 | 60,817.40 | 50,739.00 | 67,444.00 |
| 3 | Utah Valley University | 2,730.00 | 57,600.16 | 16,331.00 | 103,513.00 |
| 4 | Weber State University | 2,142.00 | 55,969.97 | 32,470.00 | 89,974.00 |
| 5 | Utah State University | 2,921.00 | 52,193.71 | 25,367.00 | 91,862.00 |
| 6 | Southern Utah University | 475.00 | 44,503.17 | 31,561.00 | 62,156.00 |
fig = go.Figure()
univs = df['INSTNM'].unique().tolist() # Get list of unique universities
# Create Violin plot for each University
for univ in univs:
fig.add_trace(go.Violin(x=df['INSTNM'][df['INSTNM'] == univ],
y=df['EARN_MDN_4YR'][df['INSTNM'] == univ],
name=univ,
box_visible=True,
meanline_visible=True,
showlegend=False))
# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
title="Earnings 4 Years after Completion Grouped by University",
yaxis_title="Earnings",
font=dict(
family="Courier New, monospace",
size=10,
color="RebeccaPurple"
)
)
fig.show()
fig = go.Figure()
univs = df['INSTNM'].unique().tolist() # Get list of unique universities
# Create Violin plot for each University
for univ in univs:
fig.add_trace(go.Violin(x=df['INSTNM'][df['INSTNM'] == univ],
y=df['EARN_MDN_1YR'][df['INSTNM'] == univ],
name=univ,
box_visible=True,
meanline_visible=True,
showlegend=False))
# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
title="Earnings 1 Years after Completion Grouped by University",
yaxis_title="Earnings",
font=dict(
family="Courier New, monospace",
size=10,
color="RebeccaPurple"
)
)
fig.show()
Field of study defined by first two digits of CIPCODE
# Group earnings for each field of study (summary of all programs with public record)
field_1yr_stats = df.groupby('CIPDEF').agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
field_1yr_stats = field_1yr_stats.sort_values(by=('EARN_MDN_1YR', 'mean'), ascending=False).reset_index(drop=True)
field_1yr_stats
| CIPDEF | EARN_COUNT_WNE_1YR | EARN_MDN_1YR | |||
|---|---|---|---|---|---|
| sum | mean | min | max | ||
| 0 | ENGINEERING. | 1,053.00 | 69,412.13 | 60,203.00 | 82,864.00 |
| 1 | COMPUTER AND INFORMATION SCIENCES AND SUPPORT SERVICES. | 1,047.00 | 69,317.67 | 37,101.00 | 86,469.00 |
| 2 | CONSTRUCTION TRADES. | 55.00 | 68,886.00 | 68,886.00 | 68,886.00 |
| 3 | ENGINEERING/ENGINEERING-RELATED TECHNOLOGIES/TECHNICIANS. | 79.00 | 65,609.00 | 62,141.00 | 69,077.00 |
| 4 | MATHEMATICS AND STATISTICS. | 94.00 | 57,342.50 | 56,462.00 | 58,223.00 |
| 5 | BUSINESS, MANAGEMENT, MARKETING, AND RELATED SUPPORT SERVICES. | 2,807.00 | 55,107.69 | 33,485.00 | 74,883.00 |
| 6 | TRANSPORTATION AND MATERIALS MOVING. | 212.00 | 50,560.50 | 44,128.00 | 56,993.00 |
| 7 | PHYSICAL SCIENCES. | 78.00 | 50,117.33 | 38,443.00 | 65,065.00 |
| 8 | COMMUNICATIONS TECHNOLOGIES/TECHNICIANS AND SUPPORT SERVICES. | 24.00 | 48,703.00 | 48,703.00 | 48,703.00 |
| 9 | HEALTH PROFESSIONS AND RELATED PROGRAMS. | 2,074.00 | 46,428.11 | 27,319.00 | 63,701.00 |
| 10 | HOMELAND SECURITY, LAW ENFORCEMENT, FIREFIGHTING AND RELATED PROTECTIVE SERVICES. | 305.00 | 44,299.50 | 36,865.00 | 56,578.00 |
| 11 | ARCHITECTURE AND RELATED SERVICES. | 60.00 | 44,067.50 | 39,464.00 | 48,671.00 |
| 12 | EDUCATION. | 1,295.00 | 43,397.00 | 33,949.00 | 57,508.00 |
| 13 | SOCIAL SCIENCES. | 953.00 | 40,257.64 | 26,905.00 | 56,599.00 |
| 14 | FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS. | 255.00 | 38,294.57 | 29,046.00 | 49,724.00 |
| 15 | AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE AND RELATED FIELDS. | 108.00 | 37,836.50 | 30,278.00 | 45,395.00 |
| 16 | LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMANITIES. | 291.00 | 36,959.00 | 30,086.00 | 42,698.00 |
| 17 | COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS. | 900.00 | 36,397.67 | 25,858.00 | 46,269.00 |
| 18 | PUBLIC ADMINISTRATION AND SOCIAL SERVICE PROFESSIONS. | 213.00 | 34,515.00 | 30,124.00 | 38,055.00 |
| 19 | AREA, ETHNIC, CULTURAL, GENDER, AND GROUP STUDIES. | 95.00 | 33,436.50 | 33,213.00 | 33,660.00 |
| 20 | PARKS, RECREATION, LEISURE, FITNESS, AND KINESIOLOGY. | 580.00 | 32,997.71 | 23,297.00 | 45,748.00 |
| 21 | MULTI/INTERDISCIPLINARY STUDIES. | 152.00 | 32,400.33 | 28,338.00 | 37,755.00 |
| 22 | ENGLISH LANGUAGE AND LITERATURE/LETTERS. | 423.00 | 32,389.29 | 28,928.00 | 41,726.00 |
| 23 | HISTORY. | 160.00 | 31,192.00 | 28,541.00 | 33,397.00 |
| 24 | PSYCHOLOGY. | 1,019.00 | 29,626.17 | 24,986.00 | 32,569.00 |
| 25 | BIOLOGICAL AND BIOMEDICAL SCIENCES. | 653.00 | 29,471.78 | 20,357.00 | 38,438.00 |
| 26 | VISUAL AND PERFORMING ARTS. | 765.00 | 28,146.20 | 17,223.00 | 39,217.00 |
| 27 | FAMILY AND CONSUMER SCIENCES/HUMAN SCIENCES. | 621.00 | 26,417.86 | 19,031.00 | 32,998.00 |
# Group earnings for each field of study (summary of all programs with public record)
field_4yr_stats = df.groupby('CIPDEF').agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
field_4yr_stats = field_4yr_stats.sort_values(by=('EARN_MDN_4YR', 'mean'), ascending=False).reset_index(drop=True)
field_4yr_stats
| CIPDEF | EARN_COUNT_WNE_4YR | EARN_MDN_4YR | |||
|---|---|---|---|---|---|
| sum | mean | min | max | ||
| 0 | COMPUTER AND INFORMATION SCIENCES AND SUPPORT SERVICES. | 826.00 | 88,258.00 | 48,538.00 | 114,046.00 |
| 1 | ENGINEERING. | 855.00 | 87,750.60 | 73,089.00 | 127,592.00 |
| 2 | MATHEMATICS AND STATISTICS. | 59.00 | 79,298.00 | 69,595.00 | 89,001.00 |
| 3 | CONSTRUCTION TRADES. | 37.00 | 78,807.00 | 78,807.00 | 78,807.00 |
| 4 | ENGINEERING/ENGINEERING-RELATED TECHNOLOGIES/TECHNICIANS. | 104.00 | 78,380.50 | 72,469.00 | 84,292.00 |
| 5 | COMMUNICATIONS TECHNOLOGIES/TECHNICIANS AND SUPPORT SERVICES. | 23.00 | 74,650.00 | 74,650.00 | 74,650.00 |
| 6 | BUSINESS, MANAGEMENT, MARKETING, AND RELATED SUPPORT SERVICES. | 2,427.00 | 72,721.94 | 50,174.00 | 97,349.00 |
| 7 | TRANSPORTATION AND MATERIALS MOVING. | 230.00 | 70,107.50 | 59,224.00 | 80,991.00 |
| 8 | PHYSICAL SCIENCES. | 59.00 | 69,189.33 | 61,624.00 | 76,268.00 |
| 9 | ARCHITECTURE AND RELATED SERVICES. | 45.00 | 62,267.00 | 61,739.00 | 62,795.00 |
| 10 | SOCIAL SCIENCES. | 729.00 | 56,863.71 | 30,608.00 | 81,529.00 |
| 11 | HOMELAND SECURITY, LAW ENFORCEMENT, FIREFIGHTING AND RELATED PROTECTIVE SERVICES. | 381.00 | 55,988.75 | 43,401.00 | 70,785.00 |
| 12 | BIOLOGICAL AND BIOMEDICAL SCIENCES. | 468.00 | 54,691.33 | 35,147.00 | 73,566.00 |
| 13 | FOREIGN LANGUAGES, LITERATURES, AND LINGUISTICS. | 291.00 | 53,570.57 | 33,074.00 | 66,679.00 |
| 14 | HEALTH PROFESSIONS AND RELATED PROGRAMS. | 1,681.00 | 52,063.72 | 23,498.00 | 68,640.00 |
| 15 | AREA, ETHNIC, CULTURAL, GENDER, AND GROUP STUDIES. | 87.00 | 50,107.00 | 44,503.00 | 55,711.00 |
| 16 | COMMUNICATION, JOURNALISM, AND RELATED PROGRAMS. | 664.00 | 49,233.00 | 42,911.00 | 58,056.00 |
| 17 | AGRICULTURAL/ANIMAL/PLANT/VETERINARY SCIENCE AND RELATED FIELDS. | 64.00 | 47,654.50 | 38,971.00 | 56,338.00 |
| 18 | MULTI/INTERDISCIPLINARY STUDIES. | 196.00 | 46,442.00 | 39,623.00 | 56,942.00 |
| 19 | PUBLIC ADMINISTRATION AND SOCIAL SERVICE PROFESSIONS. | 244.00 | 45,043.00 | 39,435.00 | 47,214.00 |
| 20 | EDUCATION. | 1,189.00 | 44,786.85 | 36,162.00 | 49,647.00 |
| 21 | LIBERAL ARTS AND SCIENCES, GENERAL STUDIES AND HUMANITIES. | 256.00 | 44,328.25 | 32,839.00 | 51,593.00 |
| 22 | HISTORY. | 140.00 | 43,957.00 | 36,231.00 | 54,987.00 |
| 23 | PARKS, RECREATION, LEISURE, FITNESS, AND KINESIOLOGY. | 347.00 | 42,995.57 | 25,700.00 | 56,514.00 |
| 24 | ENGLISH LANGUAGE AND LITERATURE/LETTERS. | 419.00 | 42,187.71 | 38,592.00 | 46,995.00 |
| 25 | PSYCHOLOGY. | 693.00 | 39,981.00 | 31,561.00 | 44,664.00 |
| 26 | VISUAL AND PERFORMING ARTS. | 628.00 | 35,614.53 | 16,331.00 | 47,300.00 |
| 27 | FAMILY AND CONSUMER SCIENCES/HUMAN SCIENCES. | 425.00 | 34,970.86 | 24,823.00 | 44,984.00 |
fig = go.Figure()
fields = df['CIPFIELD'].unique().tolist() # Get list of unique fields of study
# Create violin plot for each field of study
for field in fields:
fig.add_trace(go.Violin(x=df['CIPFIELD'][df['CIPFIELD'] == field],
y=df['EARN_MDN_4YR'][df['CIPFIELD'] == field],
name=field,
box_visible=True,
meanline_visible=True,
showlegend=False))
# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
title="Earnings 4 Years after Completion Grouped by Field of Study",
yaxis_title="Earnings",
font=dict(
family="Courier New, monospace",
size=10,
color="RebeccaPurple"
)
)
fig.show()
fig = go.Figure()
fields = df['CIPFIELD'].unique().tolist() # Get list of unique fields of study
# Create violin plot for each field of study
for field in fields:
fig.add_trace(go.Violin(x=df['CIPFIELD'][df['CIPFIELD'] == field],
y=df['EARN_MDN_1YR'][df['CIPFIELD'] == field],
name=field,
box_visible=True,
meanline_visible=True,
showlegend=False))
# Format the plot
fig.update_yaxes(tickformat="$,.0f")
fig.update_layout(
title="Earnings 1 Year after Completion Grouped by Field of Study",
yaxis_title="Earnings",
font=dict(
family="Courier New, monospace",
size=10,
color="RebeccaPurple"
)
)
fig.show()
# Group earnings for each field of study for each university
univ_field_1yr = df.groupby(['INSTNM', 'CIPDEF']).agg({'EARN_COUNT_WNE_1YR': 'sum','EARN_MDN_1YR': ['mean', 'min', 'max']}).reset_index()
univ_field_1yr.to_excel('Data/univ_field_1yr.xlsx')
# Group earnings for each field of study for each university
univ_field_4yr = df.groupby(['INSTNM', 'CIPDEF']).agg({'EARN_COUNT_WNE_4YR': 'sum','EARN_MDN_4YR': ['mean', 'min', 'max']}).reset_index()
univ_field_4yr.to_excel('Data/univ_field_4yr.xlsx')